{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Huggingface \n",
    "\n",
    "`Huggingface`提供了两种方式调用LLM\n",
    "1. 通过Api token 的方式\n",
    "2. 本地加载\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 安装环境"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "! pip install langchain huggingface_hub transformers sentence_transformers accelerate bitsandbytes"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 使用API  token 调用LLM"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from getpass import getpass\n",
    "\n",
    "HUGGINGFACEHUB_API_TOKEN = getpass()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "os.environ[\"HUGGINGFACEHUB_API_TOKEN\"] = HUGGINGFACEHUB_API_TOKEN"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain_community.llms import HuggingFaceHub\n",
    "from langchain.chains import LLMChain\n",
    "from langchain.prompts import PromptTemplate"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "### 创建prompt 模板\n",
    "question = \"Where is the capital of China? \"\n",
    "\n",
    "template = \"\"\"Question: {question}\n",
    "\n",
    "Answer: Let's think step by step.\"\"\"\n",
    "\n",
    "prompt = PromptTemplate(template=template, input_variables=[\"question\" ])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "repo_id = \"google/flan-t5-base\"  # 具体可以参考 https://huggingface.co/models?pipeline_tag=text-generation&sort=downloads "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "llm = HuggingFaceHub(\n",
    "    repo_id=repo_id, \n",
    ")\n",
    "llm_chain = LLMChain(prompt=prompt, llm=llm  , llm_kwargs = {\"temperature\":0, \"max_length\":512})\n",
    "\n",
    "print(llm_chain.run(question))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 构建RAG检索"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "! pip install pypdf  faiss-cpu"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.document_loaders import PyPDFLoader\n",
    "\n",
    "###加载文件\n",
    "loader = PyPDFLoader(\"data//baichuan.pdf\")\n",
    "pages = loader.load()\n",
    "\n",
    "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
    "\n",
    "###文本切分\n",
    "text_splitter = RecursiveCharacterTextSplitter(chunk_size = 300,chunk_overlap = 50,)\n",
    "\n",
    "docs = text_splitter.split_documents(pages[:4])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings\n",
    "from langchain_community.vectorstores import FAISS\n",
    "\n",
    "\n",
    "embeddings = HuggingFaceInferenceAPIEmbeddings(\n",
    "    api_key=HUGGINGFACEHUB_API_TOKEN, model_name=\"sentence-transformers/all-MiniLM-l6-v2\"\n",
    ")\n",
    "\n",
    "db = FAISS.from_documents(docs, embeddings)\n",
    "\n",
    "query = \"How large is the baichuan2 vocabulary size?\"\n",
    "result_simi = db.similarity_search(query , k = 3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "source_knowledge = \"\\n\".join([x.page_content for x in result_simi])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "augmented_prompt = \"\"\"Using the contexts below, answer the query.\n",
    "\n",
    "contexts:\n",
    "{source_knowledge}\n",
    "\n",
    "query: {query}\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "125,696\n"
     ]
    }
   ],
   "source": [
    "prompt = PromptTemplate(template=augmented_prompt, input_variables=[\"source_knowledge\" ,\"query\"])\n",
    "\n",
    "\n",
    "llm_chain = LLMChain(prompt=prompt, llm=llm  , llm_kwargs = {\"temperature\":0, \"max_length\":1024})\n",
    "\n",
    "print(llm_chain.run( {\"source_knowledge\":source_knowledge ,\"query\" : query }))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "augmented_prompt_2 = f\"\"\"Using the contexts below, answer the query.\n",
    "\n",
    "contexts:\n",
    "{source_knowledge}\n",
    "\n",
    "query: {query}\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Using the contexts below, answer the query.\n",
      "\n",
      "contexts:\n",
      "have taken both these aspects into account. We\n",
      "have expanded the vocabulary size from 64,000\n",
      "in Baichuan 1 to 125,696, aiming to strike a\n",
      "balance between computational efficiency and\n",
      "model performance.\n",
      "Tokenizer V ocab Size Compression Rate ↓\n",
      "LLaMA 2 32,000 1.037\n",
      "Bloom 250,680 0.501\n",
      "improve after training on more than 2.6 trillion\n",
      "tokens. By sharing these intermediary results,\n",
      "we hope to provide the community with greater\n",
      "insight into the training dynamics of Baichuan 2.\n",
      "Understanding these dynamics is key to unraveling\n",
      "the inner working mechanism of large language\n",
      "Baichuan 2: Open Large-scale Language Models\n",
      "Aiyuan Yang, Bin Xiao, Bingning Wang, Borong Zhang, Chao Yin, Chenxu Lv, Da Pan\n",
      "Dian Wang, Dong Yan, Fan Yang, Fei Deng, Feng Wang, Feng Liu, Guangwei Ai\n",
      "Guosheng Dong, Haizhou Zhao, Hang Xu, Haoze Sun, Hongda Zhang, Hui Liu, Jiaming Ji\n",
      "\n",
      "query: How large is the baichuan2 vocabulary size?\n"
     ]
    }
   ],
   "source": [
    "print(augmented_prompt_2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 本地加载LLM\n",
    "\n",
    "- baichuan model 为例"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
    "from transformers.generation.utils import GenerationConfig"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "from modelscope import snapshot_download, Model\n",
    "model_dir = snapshot_download(\"baichuan-inc/Baichuan2-7B-Chat\", revision='master')\n",
    "model = Model.from_pretrained(model_dir, device_map=\"auto\", trust_remote_code=True, torch_dtype=torch.float16)\n",
    "messages = []\n",
    "messages.append({\"role\": \"user\", \"content\": \"讲解一下“温故而知新”\"})\n",
    "response = model(messages)\n",
    "print(response)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "content = '''Using the contexts below, answer the query.\n",
    "\n",
    "contexts:\n",
    "have taken both these aspects into account. We\n",
    "have expanded the vocabulary size from 64,000\n",
    "in Baichuan 1 to 125,696, aiming to strike a\n",
    "balance between computational efficiency and\n",
    "model performance.\n",
    "Tokenizer V ocab Size Compression Rate ↓\n",
    "LLaMA 2 32,000 1.037\n",
    "Bloom 250,680 0.501\n",
    "improve after training on more than 2.6 trillion\n",
    "tokens. By sharing these intermediary results,\n",
    "we hope to provide the community with greater\n",
    "insight into the training dynamics of Baichuan 2.\n",
    "Understanding these dynamics is key to unraveling\n",
    "the inner working mechanism of large language\n",
    "Baichuan 2: Open Large-scale Language Models\n",
    "Aiyuan Yang, Bin Xiao, Bingning Wang, Borong Zhang, Chao Yin, Chenxu Lv, Da Pan\n",
    "Dian Wang, Dong Yan, Fan Yang, Fei Deng, Feng Wang, Feng Liu, Guangwei Ai\n",
    "Guosheng Dong, Haizhou Zhao, Hang Xu, Haoze Sun, Hongda Zhang, Hui Liu, Jiaming Ji\n",
    "\n",
    "query: How large is the baichuan2 vocabulary size?\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "messages = []\n",
    "messages.append({\"role\": \"user\", \"content\": content})\n",
    "response = model(messages)\n",
    "print(response)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "llangchainhf",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
